# **MODIFY THIS CHUNK**
library(here)
kundaje_dir <- trimws(readr::read_lines("../../AK_PROJ_DIR.txt"))
doc_id      <- "01"
out         <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)
figout      <- here("figures/03-chrombpnet/03-syntax", doc_id, "/"); dir.create(figout, recursive = TRUE)

1 Overview

In this notebook, we do global investigations of the de novo motifs:

  • computing stats on the motif instances such as frequencies per peak
  • summarize each motif’s distribution of distances to nucleosome dyads and peak flanks
  • distribution of motif instances in genomic features
  • distribution of motif instances in tissues and tissue compartments

2 Set up

library(glue)
library(scales)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(plotly)
library(ggseqlogo)

script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))

ggplot2::theme_set(theme_BOR())

3 Load & wrangle data

Cluster metadata:

cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))
## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl  (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
                                   col_names = c("Cluster", "Folds_keep", "Cluster_ID"))
## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"

head(chrombpnet_models_keep)
## # A tibble: 6 × 3
##   Cluster    Folds_keep                         Cluster_ID
##   <chr>      <chr>                              <chr>     
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0      
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1      
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2      
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3      
## 5 Brain_c0   fold_0,fold_1,fold_2,fold_3,fold_4 BR_0      
## 6 Brain_c1   fold_0,fold_1,fold_2,fold_3,fold_4 BR_1

Load metrics:

chrombpnet_metrics <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_metrics.tsv")) %>% 
  dplyr::select(Cluster_ChromBPNet, total_frags) %>% 
  distinct()
## Rows: 1015 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (5): organ, Cluster_ChromBPNet, L1_annot, Model, Fold
## dbl (13): number_of_cells, total_frags, counts_metrics.peaks.spearmanr, coun...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Load number of peaks:

npeaks <- read_table(here("output/03-chrombpnet/01-models/qc/npeaks.tsv"),
                   col_names = c("n_peaks", "peak_file")) %>% 
  filter(peak_file != "total")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   n_peaks = col_double(),
##   peak_file = col_character()
## )
nrow(npeaks) == 203
## [1] TRUE
npeaks <- npeaks %>% 
  rowwise() %>% 
  mutate(peak_file = basename(peak_file)) %>% 
  separate(peak_file, into = c("Cluster_ChromBPNet", "drop"), sep = "__") %>% 
  dplyr::select(-drop)

cluster_meta <- cluster_meta %>%
  left_join(npeaks, by = "Cluster_ChromBPNet") %>% 
  left_join(chrombpnet_metrics, by = "Cluster_ChromBPNet")

Load merged modisco reports:

modisco_merged <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled_anno/modisco_merged_reports.tsv")) %>% 
  # get short cluster ID
  left_join(cluster_meta %>% dplyr::select(Cluster, Cluster_ChromBPNet),
            by = c("component_celltype" = "Cluster_ChromBPNet"))
## Rows: 6362 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): merged_pattern, component_celltype, pattern_class, pattern, compone...
## dbl (1): n_seqlets
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(modisco_merged)
## # A tibble: 6 × 7
##   merged_pattern              component_celltype pattern_class pattern n_seqlets
##   <chr>                       <chr>              <chr>         <chr>       <dbl>
## 1 neg.Average_12__merged_pat… Spleen_c5          neg_patterns  patter…        88
## 2 neg.Average_12__merged_pat… Spleen_c0          neg_patterns  patter…       461
## 3 neg.Average_12__merged_pat… Skin_c5            neg_patterns  patter…       302
## 4 neg.Average_12__merged_pat… Heart_c13          neg_patterns  patter…       128
## 5 neg.Average_12__merged_pat… Skin_c7            neg_patterns  patter…       708
## 6 neg.Average_12__merged_pat… Heart_c5           neg_patterns  patter…        47
## # ℹ 2 more variables: component_pattern <chr>, Cluster <chr>

Load annotated patterns:

# manually annotated motif patterns
motifs_compiled_all <- read_tsv("../02-compendium/04d-ChromBPNet_de_novo_motifs.tsv") %>% 
  mutate(motif_name = paste0(idx_uniq, "|", annotation)) %>% 
  dplyr::relocate(motif_name, .after = pattern) %>%
  dplyr::relocate(idx_uniq, .after = idx) %>% 
  # rename categories
  mutate(category = ifelse(annotation == "exclude", "exclude", category),
         category = case_match(category,
                               "composite_homo" ~ "homocomposite",
                               "composite_hetero" ~ "heterocomposite",
                               .default = category),
         category = factor(category, levels = names(cmap_category)),
         pattern_class = dplyr::recode(pattern_class, pos = "pos_patterns", neg = "neg_patterns"))
## Rows: 834 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): pattern_class, pattern, component_organs, component_celltypes, que...
## dbl (14): idx, total_n_seqlets, n_component_patterns, n_component_celltypes,...
## lgl  (6): modisco_cwm_fwd, modisco_cwm_rev, match0_logo, match0_logo_jolma, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
motifs_compiled <- motifs_compiled_all %>% 
  # filter out exclude motifs
  filter(category != "exclude") %>% 
  # extract the name of the known motif
  separate(match0, into = c("TF0", "motif0"), sep = ": ")

# for cases with multiple patterns merged into one motif, take the one with the most
# number of seqlets.
motifs_compiled_unique <- motifs_compiled %>% 
  group_by(idx_uniq) %>% 
  slice_max(n = 1, order_by = tibble(total_n_seqlets, n_component_celltypes),
            with_ties = FALSE)

dim(motifs_compiled_unique)
## [1] 508  43
# get positive and negative patterns
pos_patterns <- motifs_compiled_unique %>% filter(pattern_class == "pos_patterns") %>% pull(motif_name)
length(pos_patterns)
## [1] 493
neg_patterns <- motifs_compiled_unique %>% filter(pattern_class == "neg_patterns") %>% pull(motif_name)
length(neg_patterns)
## [1] 15
# initial breakdown of motif categories, prior manual collapsing
table(motifs_compiled_all$category)
## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##             147             128             135             263              43 
##          repeat         partial         exclude 
##               7               4             107
# breakdown after manual collapsing & removing low-quality motifs
table(motifs_compiled_unique$category)
## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##              86              51             107             222              38 
##          repeat         partial         exclude 
##               2               2               0
dim(motifs_compiled_unique)
## [1] 508  43

Thus, in total we have:

  • 508 unique motifs learned from the dataset
  • 493 positive patterns
  • 15 negative patterns

NOTE: There are 509 unique motifs, and the labels go from 0 to 508 (which spans 509 values). The “missing label” is for idx_uniq 466, which corresponded to unresolved motifs which were filtered out.

Load reconciled hit calls:

finemo_param <- "counts_v0.23_a0.8_all"

# load hit counts per motif per cell type
hits <- map(chrombpnet_models_keep$Cluster,
            ~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_motif.tsv")),
                                data.table = FALSE) %>%
              dplyr::rename(n = count) %>% 
              tibble::add_column(Cluster = .x, .before = 1)) %>%
  setNames(chrombpnet_models_keep$Cluster)

# sanity check
n_cols <- map_dbl(hits, ncol)

if (any(n_cols < 12)) {
  
  missing_cols <- names(n_cols[n_cols < 12])
  stop("@ Missing columns in ", glue_collapse(missing_cols, ", "))
  
}

saveRDS(hits, file = glue("{out}/hits.Rds"))

# load summary stats on hits per peak per cell type
hits_per_peak <- map_dfr(chrombpnet_models_keep$Cluster,
                         ~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_peak.tsv")),
                                             data.table = FALSE) %>%
                           tibble::add_column(Cluster = .x, .before = 1))

Calculate total number of hits per pattern per cell type:

hits_all <- bind_rows(hits)

if (!all(motifs_compiled_unique$motif_name %in% hits_all$motif_name) | !all(hits_all$motif_name %in% motifs_compiled$motif_name)) {
  
  stop("@ Motif names in annotation don't match motif names in hits.")
  
}

hits_all_totals <- hits_all %>%
  group_by(motif_name) %>%
  summarize(total_hits = sum(n))

# organ w/ most hits
hits_top_organ <- hits_all %>% 
  left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>% 
  group_by(motif_name, organ) %>% 
  summarize(hits_per_organ = sum(n)) %>% 
  group_by(motif_name) %>% 
  slice_max(n = 1, order_by = hits_per_organ, with_ties = FALSE) %>% 
  dplyr::select(motif_name, top_organ = organ)
## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'motif_name'. You can override using the
## `.groups` argument.
dim(hits_all_totals)
## [1] 508   2
dim(hits_top_organ)
## [1] 508   2
motifs_compiled_unique <- hits_all_totals %>% 
  left_join(motifs_compiled_unique, by = "motif_name") %>% 
  left_join(hits_top_organ, by = "motif_name") %>% 
  mutate(motif_name_safe = gsub("\\/|\\||\\#", ".", motif_name)) %>% 
  mutate(modisco_cwm_fwd = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.fwd.png")),
         modisco_cwm_rev = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.rev.png")))

# also subset to positive and neg patterns
motifs_compiled_unique_pos <- motifs_compiled_unique %>% 
  filter(pattern_class == "pos_patterns")

motifs_compiled_unique_neg <- motifs_compiled_unique %>% 
  filter(pattern_class == "neg_patterns")

dim(motifs_compiled_unique_pos)
## [1] 493  46
hits_all_anno <- hits_all %>% 
  left_join(cluster_meta %>% dplyr::rename(Cluster_short = Cluster), by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  left_join(motifs_compiled_unique %>% dplyr::select(motif_name, pattern, annotation_broad, category))
## Joining with `by = join_by(motif_name)`
head(hits_all_anno)
##      Cluster                           motif_name pattern_class      n Distal
## 1 Adrenal_c0                    392|NR:ESRRA/NR5A  pos_patterns 158975  52205
## 2 Adrenal_c0                         456|ZEB/SNAI  neg_patterns 122904  31275
## 3 Adrenal_c0                 132|BZIP:FOSL/JUND#1  pos_patterns  52506  17353
## 4 Adrenal_c0                           436|SP/KLF  pos_patterns  45828   6164
## 5 Adrenal_c0                           260|GATA#1  pos_patterns  33086  11270
## 6 Adrenal_c0 507|unresolved,repressive,YY1/2-like  neg_patterns  32037   8234
##   Exonic Intronic Promoter mean_distToTSS median_distToTSS
## 1   9379    84616    12775       20180.92           8859.0
## 2   8002    48500    35127       11567.57           2944.0
## 3   2527    28584     4042       20839.47           9456.0
## 4   2238     9426    28000        3203.65            140.0
## 5   1370    18616     1830       22932.57          10593.5
## 6   2288    12955     8560       12456.09           3584.0
##   mean_distToPeakSummit median_distToPeakSummit Cluster_short   organ
## 1                129.77                      67          AG_0 Adrenal
## 2                171.96                     128          AG_0 Adrenal
## 3                114.59                      50          AG_0 Adrenal
## 4                 88.78                      52          AG_0 Adrenal
## 5                128.59                      74          AG_0 Adrenal
## 6                150.93                     114          AG_0 Adrenal
##   organ_code cluster_id compartment              L1_annot
## 1         AG          0         epi AG_0_adrenal cortex 1
## 2         AG          0         epi AG_0_adrenal cortex 1
## 3         AG          0         epi AG_0_adrenal cortex 1
## 4         AG          0         epi AG_0_adrenal cortex 1
## 5         AG          0         epi AG_0_adrenal cortex 1
## 6         AG          0         epi AG_0_adrenal cortex 1
##                 L2_annot       L3_annot dend_order ncell median_numi
## 1 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 2 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 3 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 4 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 5 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 6 Adrenal adrenal cortex adrenal cortex          7   823        9498
##   median_ngene median_nfrags median_tss median_frip
## 1         3280          6784      9.832   0.2906086
## 2         3280          6784      9.832   0.2906086
## 3         3280          6784      9.832   0.2906086
## 4         3280          6784      9.832   0.2906086
## 5         3280          6784      9.832   0.2906086
## 6         3280          6784      9.832   0.2906086
##                                                                         note
## 1 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 2 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 3 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 4 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 5 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 6 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
##   organ_color compartment_color n_peaks total_frags
## 1     #876941           #11A579  121634     7407892
## 2     #876941           #11A579  121634     7407892
## 3     #876941           #11A579  121634     7407892
## 4     #876941           #11A579  121634     7407892
## 5     #876941           #11A579  121634     7407892
## 6     #876941           #11A579  121634     7407892
##                             pattern annotation_broad   category
## 1 pos.Average_256__merged_pattern_0               NR       base
## 2  neg.Average_12__merged_pattern_0         ZEB/SNAI       base
## 3 pos.Average_287__merged_pattern_0             BZIP       base
## 4 pos.Average_212__merged_pattern_0           SP/KLF       base
## 5 pos.Average_248__merged_pattern_0             GATA       base
## 6  neg.Average_15__merged_pattern_1       unresolved unresolved
# breakdown of motif categories per pos patterns
table(motifs_compiled_unique_pos$category)
## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##              81              51             102             221              34 
##          repeat         partial         exclude 
##               2               2               0
# save as TSV for re-use
motifs_compiled_unique %>% write_tsv(glue("{out}/motifs_compiled_unique.tsv"))

# short version
motifs_compiled_unique %>%
  arrange(idx_uniq) %>% 
  dplyr::select(motif_name, total_hits, idx, idx_uniq, pattern_class, pattern,
                annotation, annotation_broad, category,
                TF0, motif0, qval0, top_organ, component_organs, component_celltypes) %>% 
  write_tsv(glue("{out}/motifs_compiled_unique_short.tsv"))

hits_all_anno %>% write_tsv(glue("{out}/hits_per_cluster_annotated.tsv"))

Prep the hit counts matrices across cell types:

dim(hits_all)
## [1] 33054    12
mat_hits <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = c("motif_name", "pattern_class")) %>%
  filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
  dplyr::select(Cluster, motif_name, n) %>%
  spread(motif_name, n, fill = 0) %>%
  tibble::column_to_rownames(var = "Cluster") %T>%
  write_tsv(glue("{out}/hit_matrix.pos_patterns_no_unresolved.tsv"))

dim(mat_hits)
## [1] 189 455
hits_all_broad <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
  filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
  group_by(annotation_broad, Cluster) %>%
  summarize(n = sum(n))
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
mat_hits_broad <- hits_all_broad %>%
  spread(annotation_broad, n, fill = 0) %>%
  tibble::column_to_rownames(var = "Cluster") %T>%
  write_tsv(glue("{out}/hit_matrix.broad_pos_patterns_no_unresolved.tsv"))
dim(mat_hits_broad)
## [1] 189 169
mat_hits_unresolved <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
  filter(category == "unresolved") %>%
  dplyr::select(motif_name, n, Cluster) %>%
  spread(motif_name, n, fill = 0) %>%
  as.data.frame() %>%
  tibble::column_to_rownames(var = "Cluster")

dim(mat_hits_unresolved)
## [1] 189  34

Load hit counts for binned distances to nucleosomes:

# load motif hit counts per binned distance to nucleosome dyads
hit_dyad_dist <- map_dfr(chrombpnet_models_keep$Cluster,
                         ~ data.table::fread(
                           here(glue("output/03-chrombpnet/02-compendium/nucleoatac/{.x}/{.x}.motif_dist_to_dyad.tsv")),
                           data.table = FALSE) %>%
                           tibble::add_column(Cluster = .x, .before = 1))

hit_dyad_dist <- left_join(hit_dyad_dist, motifs_compiled_unique, by = "motif_name")

Load hit counts for binned distances to peak summits:

# load motif hit counts per binned distance to nucleosome dyads
hit_summit_dist <- map_dfr(chrombpnet_models_keep$Cluster,
                           ~ data.table::fread(
                             here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/counts_v0.23_a0.8_all/motif_dist_to_summit.tsv")),
                             data.table = FALSE) %>%
                             tibble::add_column(Cluster = .x, .before = 1))

hit_summit_dist <- left_join(hit_summit_dist, motifs_compiled_unique, by = "motif_name")

Load HOCOMOCO:

hocomoco <- read_tsv(here("output/02-global_analysis/04/hocomoco_unique.tsv"))
## Rows: 949 Columns: 662
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (45): name, original_motif.name, original_motif.tf, original_motif.moti...
## dbl (617): original_motif.subtype_info.datatype_counts.P, original_motif.sub...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique_classes <- unique(hocomoco$masterlist_info.tfclass_class)
cmap_class <- c(ArchR::paletteDiscrete(unique_classes, set = "stallion"), "N/A" = "gray90")
## Length of unique values greater than palette, interpolating..

Load CWMs for the compiled motifs:

# use custom helper
cwm_list <- read_cwm_meme(here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/modisco_compiled.memedb.txt")))
## Could not find strand info, assuming +.
length(cwm_list)
## [1] 834
cwm_list <- cwm_list[base::intersect(names(cwm_list), motifs_compiled_unique$pattern)]
length(cwm_list)
## [1] 508
names(cwm_list) <- plyr::mapvalues(names(cwm_list), from = motifs_compiled_unique$pattern, to = motifs_compiled_unique$motif_name)

# label matrix dimensions
cwm_list <- map(cwm_list, function(cwm) {
  
  rownames(cwm) <- c("A", "C", "G", "T")
  colnames(cwm) <- seq(1:30)
  
  return(cwm)
  
})

print(length(cwm_list))
## [1] 508
saveRDS(cwm_list, file = glue("{out}/denovo_cwm_list.Rds"))

3.1 Save intermediate objects

save(chrombpnet_models_keep, cluster_meta, motifs_compiled_unique, motifs_compiled_all,
     cwm_list, hit_dyad_dist, hit_summit_dist, hits_all_anno, hits_per_peak,
     file = glue("{out}/intermediate_obj.Rda"))

To reload:

load(glue("{out}/intermediate_obj.Rda"))
hits <- readRDS(glue("{out}/hits.Rds"))
hits_all <- bind_rows(hits)

3.2 Rename trimmed CWM logos:

fs::dir_create(file.path(out, "trimmed_cwm_logos"))

all(fs::file_exists(motifs_compiled_unique$modisco_cwm_fwd))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_rev))

for (i in 1:nrow(motifs_compiled_unique)) {
  
  new_name_fw <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.fwd.png")
  fs::file_copy(motifs_compiled_unique$modisco_cwm_fwd[i], file.path(out, "trimmed_cwm_logos", new_name_fw), overwrite = TRUE)
  
  new_name_rev <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.rev.png")
  fs::file_copy(motifs_compiled_unique$modisco_cwm_rev[i], file.path(out, "trimmed_cwm_logos", new_name_rev), overwrite = TRUE)
  
}

3.3 Prep table

motifs_compiled_unique %>% 
  mutate(
    class = case_when(
      pattern_class == "pos_patterns" ~ "Activating",
      TRUE ~ "Reducing"
    ),
    cwm_logo_fwd = paste0(motif_name_safe, "__cwm.fwd.png"),
    cwm_logo_rev = paste0(motif_name_safe, "__cwm.rev.png"),
  ) %>% 
  dplyr::select(
    idx_uniq,
    motif_name,
    motif_name_safe,
    class,
    total_instances_across_celltypes = total_hits,
    cwm_logo_fwd,
    cwm_logo_rev,
    query_consensus,
    annotation,
    annotation_broad,
    category,
    best_match_TF = TF0,
    best_match_motif = motif0,
    best_match_TOMTOM_qval = qval0,
    best_match_TFs_in_family = match0_TFs_in_family,
    best_match_TFs_in_Vierstra_archetype = match0_TFs_in_archetype,
    total_seqlets_across_celltypes = total_n_seqlets,
    merged_pattern = pattern,
    n_constituent_motifs = n_component_patterns,
    n_constituent_celltypes = n_component_celltypes,
    constituent_organs = component_organs,
    constituent_celltypes = component_celltypes
  ) %>% 
  arrange(idx_uniq) %>% 
  write_csv(glue("{out}/TABLE_motif_compendium.csv"))

4 Global view

4.1 Categories of motifs

What kinds of patterns did we detect?

First, let’s look at all 834 motifs from the motif clustering workflow.

p1 <- motifs_compiled_all %>% 
  group_by(category) %>% 
  count() %>% 
  # reverse levels since we're flipping coordinates
  mutate(category = factor(category, levels = rev(names(cmap_category)))) %>% 
  ggplot(aes(x = category, y = n)) +
  geom_col(aes(fill = category)) +
  geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
  coord_flip() +
  scale_fill_manual(values = cmap_category) +
  ggtitle("De novo deep learning derived motifs \n(after MoDISco agggregation step)") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90")) +
  ylim(c(0, 280))

p2 <- motifs_compiled_all %>% 
  ggplot(aes(x = "breakdown")) +
  geom_bar(aes(fill = category), position = "fill") +
  scale_fill_manual(values = cmap_category)

plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(0.65, 0.35))

Next, let’s look at the unique motifs after QC and collapsing motifs:

# stacked barplot
p1 <- motifs_compiled_unique %>% 
  ggplot(aes(x = "breakdown")) +
  geom_bar(aes(fill = category), position = "fill") +
  scale_fill_manual(values = cmap_category) +
  xlab(NULL) +
  theme(legend.position = "bottom") +
  no_legend()

# barplot of counts per category
p2 <- motifs_compiled_unique %>% 
  group_by(category) %>% 
  count() %>% 
  # reverse levels since we're flipping coordinates
  mutate(category = factor(category, levels = rev(names(cmap_category)))) %>% 
  ggplot(aes(x = category, y = n)) +
  geom_col(aes(fill = category)) +
  geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
  coord_flip() +
  scale_fill_manual(values = cmap_category) +
  ggtitle("De novo deep learning derived motifs \n(after manual collapsing & removing low quality motifs)") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90")) +
  ylim(c(0, 280))


plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 4, 3.5))

4.2 Positive and negative patterns

p1 <- motifs_compiled_unique %>% 
  group_by(pattern_class) %>% 
  count() %>% 
  ggplot(aes(x = "1", y = n)) +
  geom_col(aes(fill = pattern_class), alpha = 0.4) +
  coord_flip() +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  ggtitle("De novo deep learning derived motifs") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90"))

p2 <- hits_all %>%
  left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>% 
  group_by(organ, pattern_class) %>%
  summarize(n_hits = sum(n)) %>%
  mutate(organ = factor(organ, levels = rev(unique(.$organ)))) %>% 
  ggplot(aes(x = organ, y = n_hits)) +
  geom_col(aes(fill = pattern_class), alpha = 0.4, position = "fill") +
  coord_flip() +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  ggtitle("Breakdown of total hits") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90"))
## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'organ'. You can override using the
## `.groups` argument.
plot_grid(p1, p2, ncol = 1, align = "v", axis = "rl", rel_heights = c(1, 4))

4.3 Similarity to known motifs

Distribution of TOMTOM q-values for similarity to known motifs, and total # hits per motif:

p1 <- motifs_compiled_unique %>%
  mutate(similarity = qval0) %>%
  ggplot(aes(x = category, y = -log10(qval0))) +
  geom_boxplot(aes(fill = category), outliers = FALSE) +
  geom_jitter(shape = 21, width = 0.2, size = 0.5) +
  scale_fill_manual(values = cmap_category) +
  geom_hline(yintercept = -log10(0.05)) +
  no_legend() +
  xlab(NULL) +
  rotate_x()

p2 <- motifs_compiled_unique %>% 
  ggplot(aes(x = category, y = log10(total_hits))) +
  geom_boxplot(aes(fill = category), outlier.colour = NA) +
  geom_jitter(color = "black", width = 0.2, size = 0.5, shape = 21) +
  scale_fill_manual(values = cmap_category) +
  rotate_x() +
  no_legend() +
  # ggtitle("Total # of hits per unique motif \neach point is a motif (n=525)") +
  coord_cartesian(ylim = c(1, 7))

plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")

Total number of hits per cell type:

hits_all %>%
  group_by(Cluster, pattern_class) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  ggplot(aes(x = Cluster, y = total_hits)) +
  facet_wrap(~ pattern_class, ncol = 1) +
  geom_bar(stat = "identity", aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  rotate_x() +
  ylab("Total number of hits") +
  theme(
    strip.background = element_blank(),
    axis.text.x = element_text(size = 5),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits per cell type")
## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

4.4 Total hits

By class (positive/negative patterns):

hits_all %>%
  left_join(motifs_compiled_unique %>% dplyr::select(-pattern_class),
            by = c("motif_name")) %>%
  group_by(Cluster, pattern_class) %>%
  summarize(n_per_class = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  ggplot(aes(x = Cluster, y = n_per_class)) +
  geom_bar(stat = "identity", aes(fill = pattern_class), alpha = 0.4, position = "fill") +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  rotate_x() +
  ylab("Total number of hits") +
  theme(
    axis.text.x = element_text(size = 5),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits, by pattern class")
## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

Let’s plot total hits and mean hits per peak, grouped by organ:

p1 <- hits_all %>% 
  group_by(Cluster, pattern_class) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster)),
         pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>% 
  ggplot(aes(x = organ, y = total_hits)) +
  geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 3, alpha = 0.6, width = 0.25) +
  scale_fill_manual(values = cmap_organ) +
  rotate_x() +
  facet_wrap(~ pattern_class, ncol = 1) +
  ylab("Total number of hits") +
  theme(
    strip.background = element_blank(),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits (+ patterns)") +
  no_legend()
## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
p2 <- hits_per_peak %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  gather(pattern_class, median_n_hits, median_pos_patterns, median_neg_patterns) %>% 
  mutate(pattern_class = factor(pattern_class, levels = c("median_pos_patterns", "median_neg_patterns"))) %>% 
  ggplot(aes(x = organ, y = median_n_hits)) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 3, alpha = 0.6) + #, width = 0.25, height = 0) +
  scale_fill_manual(values = cmap_organ) +
  facet_wrap(~ pattern_class, ncol = 1) +
  rotate_x() +
  ylab("Median number of hits per peak (+ patterns)") +
  theme(
    strip.background = element_blank(),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(breaks = seq(0, 10, 1), labels = seq(0, 10), limits = c(0, 9)) +
  no_legend() +
  ggtitle("Median hits per peak (+ patterns)")

plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")

4.5 Effects of coverage/peaks on number of hits

Here we look at how total number of motif instances varies with total fragments and number of peaks:

# calculate total hist per cluster (incl. both pos/neg patterns)
total_hits_per_cluster <- hits_all %>% 
  group_by(Cluster) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))

# this is for the 189 models which passed QC
total_hits_per_cluster$Cluster %>% unique %>% length
## [1] 189
summary(total_hits_per_cluster$total_hits)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   73500  457034  664835  839544 1110354 2593062
cor_frags <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
summary(cor_frags)
## 
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1266989  -240087   -88317   214846  1610838 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.052e+05  3.600e+04   14.03   <2e-16 ***
## total_hits_per_cluster$total_frags 8.947e-03  6.326e-04   14.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 373300 on 187 degrees of freedom
## Multiple R-squared:  0.5169, Adjusted R-squared:  0.5143 
## F-statistic:   200 on 1 and 187 DF,  p-value: < 2.2e-16
cor_peaks <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
summary(cor_peaks)
## 
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -555859  -99658    6559  109954  652260 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.796e+05  3.305e+04  -5.435 1.69e-07 ***
## total_hits_per_cluster$n_peaks  8.802e+00  2.567e-01  34.294  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 198900 on 187 degrees of freedom
## Multiple R-squared:  0.8628, Adjusted R-squared:  0.8621 
## F-statistic:  1176 on 1 and 187 DF,  p-value: < 2.2e-16
p1 <- total_hits_per_cluster %>% 
  ggplot(aes(x = log10(total_frags), y = total_hits)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = label_number(scale = 1/1e6)) +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
           x = 7, y = 2e6, size = 5) +
  xlab("log10(Total fragments)") + ylab("Total hits (x 1M)") +
  square() +
  ggtitle("# motif instances vs. \n# fragments") +
  no_legend()

p2 <- total_hits_per_cluster %>% 
  ggplot(aes(x = n_peaks, y = total_hits)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = label_number(scale = 1/1e6)) +
  scale_x_continuous(labels = scales::comma) +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
           x = 7e4, y = 2e6, size = 5) +
  xlab("Total peaks") + ylab("Total hits (x 1M)") +
  square() +
  ggtitle("# motif instances vs. \n# peaks") +
  no_legend()

# median number of instances for positive motifs per peaks
n_hits_per_peaks <- hits_per_peak %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))

cor_frags <- lm(n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
summary(cor_frags)
## 
## Call:
## lm(formula = n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0016 -0.9635 -0.0603  0.7646  3.3597 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  3.942e+00  9.692e-02   40.67  < 2e-16 ***
## n_hits_per_peaks$total_frags 8.634e-09  1.703e-09    5.07 9.52e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.005 on 187 degrees of freedom
## Multiple R-squared:  0.1209, Adjusted R-squared:  0.1162 
## F-statistic: 25.71 on 1 and 187 DF,  p-value: 9.516e-07
cor_peaks <- lm(n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
summary(cor_peaks)
## 
## Call:
## lm(formula = n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.73318 -0.57254 -0.00856  0.49256  2.90880 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.991e+00  1.449e-01  20.645   <2e-16 ***
## n_hits_per_peaks$n_peaks 1.100e-05  1.125e-06   9.779   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8717 on 187 degrees of freedom
## Multiple R-squared:  0.3383, Adjusted R-squared:  0.3348 
## F-statistic: 95.63 on 1 and 187 DF,  p-value: < 2.2e-16
p3 <- n_hits_per_peaks %>% 
  ggplot(aes(x = log10(total_frags), y = median_pos_patterns)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  no_legend() +
  square() +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
           x = 7.5, y = 7, size = 5) +
  xlab("log10(Total fragments)") + ylab("median # positive hits per peak") +
  ggtitle("median positive hits per peak vs. \n# fragments")

p4 <- n_hits_per_peaks %>% 
  ggplot(aes(x = n_peaks, y = median_pos_patterns)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) + 
  scale_x_continuous(labels = scales::comma) +
  no_legend() +
  square() +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
           x = 70000, y = 7, size = 5) +
  xlab("total number of peaks") + ylab("median # positive hits per peak") +
  ggtitle("median positive hits per peak vs. \n# peaks")

# calculate quartiles
n_hits_per_peaks$n_peak_quartile <- ntile(n_hits_per_peaks$n_peaks, n=4)
table(n_hits_per_peaks$n_peak_quartile)/sum(table(n_hits_per_peaks$n_peak_quartile))
## 
##         1         2         3         4 
## 0.2539683 0.2486772 0.2486772 0.2486772
n_hits_per_peaks$total_frags_quartile <- ntile(n_hits_per_peaks$total_frags, n=4)
table(n_hits_per_peaks$total_frags_quartile)/sum(table(n_hits_per_peaks$total_frags_quartile))
## 
##         1         2         3         4 
## 0.2539683 0.2486772 0.2486772 0.2486772
p5 <- n_hits_per_peaks %>% 
  ggplot(aes(x = total_frags_quartile, y = median_pos_patterns, group = total_frags_quartile)) +
  geom_boxplot(fill = "gray90", outliers = FALSE) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
  scale_fill_manual(values = cmap_organ) +
  no_legend() +
  ylab("Median # positive hits per peak") +
  xlab("Quantile (total number of fragments)") +
  ggtitle("median positive hits per peak vs. \n# fragments")

p6 <- n_hits_per_peaks %>% 
  ggplot(aes(x = n_peak_quartile, y = median_pos_patterns, group = n_peak_quartile)) +
  geom_boxplot(fill = "gray90", outliers = FALSE) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
  scale_fill_manual(values = cmap_organ) +
  no_legend() +
  ylab("Median # positive hits per peak") +
  xlab("Quantile (total number of peaks)") +
  ggtitle("median positive hits per peak vs. \n# peaks")

plot_grid(p1, p3, p5, p2, p4, p6, align = "hv", axis = "rltb")

5 Genomic localization

5.1 Nucleosome positioning

First we need to aggregate the counts per bin over motif variants and cell types, per category:

# total counts
motifs_keep <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank")) %>% 
  group_by(annotation_broad) %>% 
  summarize(sum_all_counts = sum(n)) %>%
  arrange(sum_all_counts) %>% 
  filter(sum_all_counts > 50000) %>%
  pull(annotation_broad)

motifs_keep %>% length
## [1] 40
medians_dist_dyad <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  # we need to actually calculate the median given the binned counts
  slice_max(order_by = sum_counts, n = 1) %>% 
  arrange(bin_dist)
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
# z-score
hit_dyad_dist_summed <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  # zscore counts per motif (row-wise)
  mutate(zscore = scale(sum_counts)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
         bin_dist = as.numeric(bin_dist)) %>% 
  ungroup()
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p1 <- hit_dyad_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
  geom_vline(xintercept = 75, linewidth = 1) +
  scale_x_continuous(breaks = seq(10, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from nucleosome dyad (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")

We can plot the heatmap w/ the actual counts on top so that we can manually inspect:

hit_dyad_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  geom_text(aes(label = round(sum_counts, 2)), color = "black") +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0)) +
  geom_vline(xintercept = 75, linewidth = 1) +
  scale_x_continuous(breaks = seq(10, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from nucleosome dyad (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")

5.2 Distance to summits

hit_summit_dist_summed <- hit_summit_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  filter(bin_dist <= 250) %>% 
  # zscore counts per motif (row-wise)
  mutate(zscore = scale(sum_counts)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
         bin_dist = as.numeric(bin_dist)) %>% 
  ungroup()
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
hit_summit_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = sum_counts)) +
  scale_fill_viridis_c("plasma") +
  scale_x_continuous(breaks = seq(0, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from peak summit (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")

p2 <- hit_summit_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
  scale_x_continuous(breaks = seq(0, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from peak summit (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")
hit_summit_dist_summed <- hit_summit_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist, pattern_class) %>% 
  summarize(sum_counts = sum(n))
## `summarise()` has grouped output by 'annotation_broad', 'bin_dist'. You can
## override using the `.groups` argument.
hit_summit_dist_summed %>%
  filter(annotation_broad != "unresolved") %>% 
  group_by(annotation_broad) %>%
  mutate(cum_sum_counts = cumsum(sum_counts)) %>%
  mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>% 
  ggplot(aes(x = bin_dist, y = log10(cum_sum_counts), group = annotation_broad)) +
  # geom_point(aes(fill = pattern_class), alpha = 0.4, color = "white", shape = 21) +
  geom_line(aes(color = pattern_class), alpha = 0.4) + 
  facet_wrap(~ pattern_class, ncol = 1) +
  scale_color_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
  scale_fill_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
  theme(panel.grid.minor.x = element_line(color = "gray90"),
        panel.grid.major.x = element_line(color = "gray90"),
        strip.background.x = element_blank()) +
  no_legend()

hit_summit_dist_coarse <- hit_summit_dist %>% 
  dplyr::select(Cluster, motif_name, annotation_broad, category, pattern_class, bin_dist, n_per_cluster_per_bin = n) %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  mutate(bin_dist_coarse = case_when(
    bin_dist <= 50 ~ "0-50 bp",
    bin_dist <= 100 ~ "51-100 bp",
    bin_dist <= 150 ~ "101-150 bp",
    bin_dist <= 200 ~ "151-200 bp",
    bin_dist <= 250 ~ "201-250 bp",
    TRUE ~ "> 251 bp"
  )) %>% 
  group_by(annotation_broad, pattern_class, bin_dist_coarse) %>% 
  summarize(n_per_bin = sum(n_per_cluster_per_bin)) %>% 
  mutate(n_total = sum(n_per_bin),
         prop_ber_bin = n_per_bin / n_total)
## `summarise()` has grouped output by 'annotation_broad', 'pattern_class'. You
## can override using the `.groups` argument.
cmap_bins <- RColorBrewer::brewer.pal(6, "BuGn")
names(cmap_bins) <- rev(c("0-50 bp", "51-100 bp", "101-150 bp", "151-200 bp", "201-250 bp", "> 251 bp"))

motif_order_dist <- hit_summit_dist_coarse %>% filter(bin_dist_coarse == "0-50 bp") %>% arrange(desc(prop_ber_bin)) %>% pull(annotation_broad) %>% unique()

hit_summit_dist_coarse %>% 
  mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = names(cmap_bins)),
         annotation_broad = factor(annotation_broad, levels = motif_order_dist)) %>% 
  ggplot(aes(x = annotation_broad, y = prop_ber_bin)) +
  geom_bar(stat = "identity", aes(fill = bin_dist_coarse)) +
  scale_fill_manual(values = cmap_bins) +
  rotate_x()

Get proportions of motifs per bin:

hit_summit_dist_coarse %>% 
  mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = rev(names(cmap_bins)))) %>% 
  group_by(bin_dist_coarse) %>% 
  summarize(n_per_bin_across_motifs = sum(n_per_bin)) %>% 
  mutate(n_total = sum(n_per_bin_across_motifs),
         prop = n_per_bin_across_motifs / n_total,
         cum_n = cumsum(n_per_bin_across_motifs),
         cum_prop = cum_n / n_total) %>% 
  arrange(bin_dist_coarse)
## # A tibble: 6 × 6
##   bin_dist_coarse n_per_bin_across_motifs   n_total   prop     cum_n cum_prop
##   <fct>                             <int>     <int>  <dbl>     <int>    <dbl>
## 1 0-50 bp                        43674790 120645979 0.362   43674790    0.362
## 2 51-100 bp                      21467522 120645979 0.178   65142312    0.540
## 3 101-150 bp                     13494671 120645979 0.112   78636983    0.652
## 4 151-200 bp                      9438266 120645979 0.0782  88075249    0.730
## 5 201-250 bp                      7483429 120645979 0.0620  95558678    0.792
## 6 > 251 bp                       25087301 120645979 0.208  120645979    1

5.3 Feature type

region_type_summed <- hits_all_anno %>% 
  filter(annotation_broad %in% motifs_keep) %>% 
  dplyr::select(motif_name, annotation_broad, Distal, Exonic, Intronic, Promoter) %>% 
  gather(region_type, count, 3:ncol(.)) %>% 
  group_by(annotation_broad, region_type) %>% 
  summarize(count_total = sum(count, na.rm = TRUE)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)))
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p3 <- region_type_summed %>% 
  ggplot(aes(x = annotation_broad, y = count_total)) +
  geom_col(aes(fill = region_type), position = "fill", alpha = 0.7) +
  scale_fill_manual(values = cmap_peaktype2) +
  coord_flip() +
  theme(legend.position = "bottom")

plot_grid(p1, p2 , p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(2, 2, 1))

6 Summary plots of discovered motifs

Load known motifs from Vierstra input set:

vierstra_motifs <- read_cwm_meme(file.path(kundaje_dir, "refs/Vierstra_motifs/all.dbs.meme"))

We want to describe motifs in a given subset. This plots:

  • The de novo CWM (trimmed)
  • The PWM for the best matching known motif in the Vierstra input database
  • Total # of hits across cell types
  • Number of motif variants within a category
  • Breakdown of hits by compartment
  • Breakdown of hits by organ
  • Genomic annotations
  • Binned distances to nucleosomes

6.1 Base motifs, representative per broad anno

base_broad <- motifs_compiled_unique %>% 
  filter(category %in% c("base", "base_with_flank"))

length(base_broad$annotation_broad %>% unique())
## [1] 51
plot_motif_summary(base_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   order_by = "distToTSS",
                   plot_known_motifs = TRUE,
                   # manually indicate which motifs to reverse complement
                   # to match known motif orientation
                   to_revcomp = c("GATA", "CTCF", "RFX", "MEF2", "NFKB/RELA", "NR", "ONECUT",
                                  "ZNF143", "NR:HNF4", "REST", "THRA/B", "HD:SIX/ZNF", "ZEB/SNAI", "YY1/2rep"))
## @ plotting 51 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.2 Homocomposites, representative motif per broad anno

homocomp_broad <- motifs_compiled_unique %>%
  filter(pattern_class == "pos_patterns" & category == "homocomposite")

length(unique(homocomp_broad$annotation_broad))
## [1] 22
plot_motif_summary(homocomp_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   plot_known_motifs = TRUE,
                   rel_widths = c(3.5, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   to_revcomp = c("ETS_ETS", "IRF/STAT_IRF/STAT", "BZIP_BZIP", "HD:PITX/OTX_HD:PITX/OTX"))
## @ plotting 22 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.3 Heterocomposites, representative motif per broad anno

Just showing the top 30 here, because there are many.

heterocomp_broad <- motifs_compiled_unique %>% 
  filter(category == "heterocomposite" & !grepl("unresolved", annotation))

heterocomp_broad %>% 
  group_by(annotation_broad) %>% 
  count() %>% 
  arrange(desc(n))
## # A tibble: 95 × 2
## # Groups:   annotation_broad [95]
##    annotation_broad     n
##    <chr>            <int>
##  1 BHLH_HD             11
##  2 ETS_SP/KLF           9
##  3 ETS_NR               8
##  4 FOX_NKX              8
##  5 ETS_SOX              7
##  6 BHLH_BZIP            6
##  7 BHLH_MEF2            6
##  8 BHLH_NFI             6
##  9 ETS_NFI              6
## 10 GATA_MEF2            5
## # ℹ 85 more rows
plot_motif_summary(heterocomp_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   plot_known_motifs = FALSE,
                   # known_motifs = jolma_motifs,
                   # known_motif_col = "match0_jolma",
                   # known_motif_name = "match0_jolma",
                   rel_widths = c(3, 0.5, 1, 1, 1, 1, 1, 1.5, 1.5, 1),
                   top_n = 30)
## @ plotting 30 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.4 Negative/reducing patterns, representative per broad anno

neg_patterns <- motifs_compiled_unique %>% 
  filter(pattern_class == "neg_patterns") %>% 
  filter(!(category == "unresolved"))

# we can make use of the plotting function by setting the broad annotation to 
# the granular one
plot_motif_summary(neg_patterns %>% mutate(annotation_broad = motif_name),
                   hits_all_anno %>% mutate(annotation_broad = motif_name),
                   hit_dyad_dist %>% mutate(annotation_broad = motif_name),
                   plot_known_motifs = TRUE,
                   rel_widths = c(4, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   to_revcomp = c("456|ZEB/SNAI", "455|YY1/2repressive"))
## @ plotting 11 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.5 Unresolved motifs

unresolved <- motifs_compiled_unique %>% 
  filter(category == "unresolved") %>% 
  arrange(desc(total_hits))

plot_motif_summary(unresolved %>% mutate(annotation_broad = motif_name),
                   hits_all_anno %>% mutate(annotation_broad = motif_name), 
                   hit_dyad_dist %>% mutate(annotation_broad = motif_name),
                   order_by = "distToTSS",
                   plot_known_motifs = FALSE,
                   rel_widths = c(3.5, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   top_n = 30)
## @ plotting 30 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

7 Session info

.libPaths()
## [1] "/oak/stanford/groups/wjg/sjessa/projects/HDMA/firstpass/renv/library/R-4.1/x86_64-pc-linux-gnu"
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"                                             
## [3] "/oak/stanford/groups/akundaje/sjessa/software/R"
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4         RColorBrewer_1.1-3     ggrastr_1.0.1         
##  [4] magrittr_2.0.3         GenomicFeatures_1.46.5 AnnotationDbi_1.56.2  
##  [7] Biobase_2.54.0         GenomicRanges_1.46.1   GenomeInfoDb_1.30.1   
## [10] IRanges_2.28.0         S4Vectors_0.32.4       BiocGenerics_0.40.0   
## [13] ggseqlogo_0.1          plotly_4.10.4          cowplot_1.1.3         
## [16] ggrepel_0.9.3          stringr_1.5.0          purrr_1.0.2           
## [19] readr_2.1.4            tidyr_1.3.0            dplyr_1.1.3           
## [22] ggplot2_3.5.1          scales_1.3.0           glue_1.8.0            
## [25] here_1.0.1            
## 
## loaded via a namespace (and not attached):
##  [1] ggbeeswarm_0.7.2            colorspace_2.1-0           
##  [3] rjson_0.2.21                rprojroot_2.0.3            
##  [5] XVector_0.34.0              rstudioapi_0.15.0          
##  [7] farver_2.1.1                bit64_4.0.5                
##  [9] fansi_1.0.4                 xml2_1.3.5                 
## [11] cachem_1.0.8                knitr_1.43                 
## [13] jsonlite_1.8.7              Rsamtools_2.10.0           
## [15] dbplyr_2.3.3                png_0.1-8                  
## [17] compiler_4.1.2              httr_1.4.7                 
## [19] Matrix_1.6-3                fastmap_1.1.1              
## [21] lazyeval_0.2.2              cli_3.6.1                  
## [23] htmltools_0.5.6             prettyunits_1.1.1          
## [25] tools_4.1.2                 gtable_0.3.4               
## [27] GenomeInfoDbData_1.2.7      rappdirs_0.3.3             
## [29] Rcpp_1.0.11                 jquerylib_0.1.4            
## [31] vctrs_0.6.3                 Biostrings_2.62.0          
## [33] rtracklayer_1.54.0          xfun_0.40                  
## [35] lifecycle_1.0.3             restfulr_0.0.15            
## [37] gtools_3.9.4                XML_3.99-0.14              
## [39] zlibbioc_1.40.0             MASS_7.3-60                
## [41] vroom_1.6.3                 hms_1.1.3                  
## [43] MatrixGenerics_1.6.0        parallel_4.1.2             
## [45] SummarizedExperiment_1.24.0 yaml_2.3.7                 
## [47] curl_5.0.2                  gridExtra_2.3              
## [49] memoise_2.0.1               sass_0.4.7                 
## [51] biomaRt_2.50.3              stringi_1.7.12             
## [53] RSQLite_2.3.1               highr_0.10                 
## [55] BiocIO_1.4.0                universalmotif_1.12.4      
## [57] ArchR_1.0.2                 filelock_1.0.2             
## [59] BiocParallel_1.28.3         rlang_1.1.1                
## [61] pkgconfig_2.0.3             bitops_1.0-7               
## [63] matrixStats_1.0.0           evaluate_0.21              
## [65] lattice_0.20-45             GenomicAlignments_1.30.0   
## [67] htmlwidgets_1.6.2           labeling_0.4.3             
## [69] bit_4.0.5                   tidyselect_1.2.0           
## [71] plyr_1.8.8                  R6_2.5.1                   
## [73] generics_0.1.3              DelayedArray_0.20.0        
## [75] DBI_1.1.3                   pillar_1.9.0               
## [77] withr_2.5.0                 KEGGREST_1.34.0            
## [79] RCurl_1.98-1.12             tibble_3.2.1               
## [81] crayon_1.5.2                utf8_1.2.3                 
## [83] BiocFileCache_2.2.1         tzdb_0.4.0                 
## [85] rmarkdown_2.24              viridis_0.6.4              
## [87] progress_1.2.2              data.table_1.14.8          
## [89] blob_1.2.4                  digest_0.6.33              
## [91] munsell_0.5.0               beeswarm_0.4.0             
## [93] viridisLite_0.4.2           vipor_0.4.5                
## [95] bslib_0.5.1